Deliverables: • Perform basic data pre-processing (if needed), plus univariate and bivariate analysis. Use relevant visualizations to understand the features at hand. Which features are strongly correlated with the target variable? - 15 • Build a pruned decision tree model and present the evaluation metrics. - 15 • Build all the ensemble models taught as part of the curriculum and compare the models. - 20 • Present the model comparison in a data frame. - 5 • Comment on the code and provide a detailed explanation of the steps followed. - 5
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer #DT does not take strings as input for the model fit step....
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score,accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from IPython.display import Image
#import pydotplus as pydot
from sklearn import tree
from os import system
#from yellowbrick.classifier import ClassificationReport, ROCAUC
# Notebook-wide display setup
plt.style.use('ggplot')
pd.options.display.float_format = '{:,.5f}'.format
# display/HTML should come from IPython.display; IPython.core.display is
# deprecated (since IPython 7.14) and emits a DeprecationWarning
from IPython.display import display, HTML
# Widen the notebook container so wide dataframes are readable
display(HTML("<style>.container { width:95% !important; }</style>"))
#Load data
# NOTE(review): the path points at the "Term Deposit Subscription Prediction" project
# folder but the file loaded is parkinsons.data — presumably the UCI Parkinson's voice
# dataset; confirm the file really lives there.
PKS_df = pd.read_csv('C:\\Users\\garrettikekhua\\Downloads\\PGD-AIML\\Program Content\\Ensemble Techniques\\Project Ensemble Techniques - Term Deposit Subscription Prediction\\parkinsons.data')
PKS_df.head(10)  # first look at the raw rows
PKS_df.shape  # (rows, columns)
PKS_df.dtypes  # column types
#The 'name' Column is not important so, we will drop it
PKS_df_nw1 = PKS_df.drop(['name'], axis=1)
PKS_df_nw1.head()
PKS_df_nw1.describe().transpose()  # per-feature summary statistics
# Plot the distributions of individual variables.
# Target distribution: there are more positive than negative cases in the data
sns.countplot(x='status', data=PKS_df_nw1)  # keyword form: positional Series is deprecated in seaborn
# Distributions of the fundamental-frequency measures.
# distplot was deprecated and removed from seaborn; histplot(..., kde=True) is the replacement
sns.histplot(PKS_df_nw1['MDVP:Fo(Hz)'], kde=True)
sns.histplot(PKS_df_nw1['MDVP:Fhi(Hz)'], kde=True)
sns.histplot(PKS_df_nw1['MDVP:Flo(Hz)'], kde=True)
# Boxplots to inspect outliers
sns.boxplot(x=PKS_df_nw1['MDVP:Fo(Hz)']);   # there are no outliers
sns.boxplot(x=PKS_df_nw1['MDVP:Fhi(Hz)']);  # there are a lot of outliers
sns.boxplot(x=PKS_df_nw1['MDVP:Flo(Hz)']);  # there are some outliers
# Histograms of all feature columns. The original slice [0:23] did NOT actually
# exclude 'status' (it is not the last column), so exclude it explicitly.
columns = [c for c in PKS_df_nw1 if c != 'status']
PKS_df_nw1[columns].hist(stacked=False, bins=100, figsize=(12, 30), layout=(14, 2));
# Pairwise feature relationships.
# BUG FIX: diag_kind='1' is not a valid value (allowed: 'auto', 'hist', 'kde', None)
# and raises an error in seaborn — use 'kde' for smooth diagonal densities.
sns.pairplot(PKS_df_nw1, diag_kind='kde')
# Correlation heatmap of all numeric columns
plt.figure(figsize=(20, 10))
sns.heatmap(PKS_df_nw1.corr(),
            annot=True,
            linewidths=.5,
            center=0,
            cbar=False,
            cmap="YlGnBu")
plt.show()
# Jitter:DDP & MDVP:RAP are perfectly correlated
# Shimmer:DDA & Shimmer:APQ3 are perfectly correlated
# Some other pairs have high correlation (spread1/PPE, MDVP:Jitter(%)/Jitter:DDP) and a few others
PKS_df_nw1.info()  # feature columns are float64 — NOTE(review): 'status' is typically int64; confirm
#Split Data into features (X) and target (y)
X = PKS_df_nw1.drop("status", axis=1)
# Use plain column selection instead of pop(): pop() mutates PKS_df_nw1 in place
# (removes the 'status' column from the original frame), which is an unnecessary
# side effect — drop() above already returned a copy without the target.
y = PKS_df_nw1["status"]
# splitting data into training and test set for independent attributes (70/30)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=2)
X_train.shape, X_test.shape
print("{0:0.2f}% data is in training set".format((len(X_train) / len(PKS_df_nw1.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(X_test) / len(PKS_df_nw1.index)) * 100))
#1. First create models using Logistic Regression and Decision Tree algorithms.
# Fit a logistic-regression baseline; liblinear suits this small dataset,
# random_state=2 keeps the run reproducible like the rest of the notebook.
logreg = LogisticRegression(solver="liblinear", random_state=2)
logreg.fit(X_train, y_train)
# Gather the fitted coefficients and the intercept into one frame for inspection
coef_df = pd.DataFrame(logreg.coef_).assign(intercept=logreg.intercept_)
print(coef_df)
## function to get confusion matrix in a proper format
def draw_cm(actual, predicted):
    """Plot the confusion matrix of *actual* vs *predicted* labels as a heatmap.

    Displays the figure as a side effect and returns None.
    """
    cm = confusion_matrix(actual, predicted)
    # fmt='g' renders the integer cell counts plainly; the original '.2f'
    # showed counts as 2-decimal floats, inconsistent with the other
    # confusion-matrix heatmaps in this file, which all use fmt='g'.
    sns.heatmap(cm, annot=True, fmt='g', xticklabels=[0, 1], yticklabels=[0, 1])
    plt.ylabel('Observed')
    plt.xlabel('Predicted')
    plt.show()
# Evaluate the logistic-regression model on the held-out test set
y_predict = logreg.predict(X_test)
print("Training accuracy", logreg.score(X_train, y_train))  # typo "Trainig" fixed
print()
print("Testing accuracy", logreg.score(X_test, y_test))
print()
print('Confusion Matrix')
# draw_cm plots the matrix and returns None — calling it inside print()
# (as the original did) printed a stray "None" after the figure.
draw_cm(y_test, y_predict)
print()
print("Recall:", recall_score(y_test, y_predict))
print()
print("Precision:", precision_score(y_test, y_predict))
print()
print("F1 Score:", f1_score(y_test, y_predict))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
#AUC ROC curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Score the positive class with predicted probabilities. The original computed
# the AUC from hard predict() labels while drawing the curve from predict_proba,
# which understates the AUC relative to the curve shown — use probabilities for both.
y_score = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Parkinson DPM')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
#Build Decision Tree Model
#GINI
# We will build our model using the DecisionTreeClassifier function with the 'gini' method of split
PKS_tree_gini = DecisionTreeClassifier(criterion='gini', random_state=2)
PKS_tree_gini.fit(X_train, y_train)
print("Train: %.2f" % PKS_tree_gini.score(X_train, y_train))  # performance on train data
print("Test: %.2f" % PKS_tree_gini.score(X_test, y_test))     # performance on test data
# There seems to be some overfitting in the model (test accuracy ~76%),
# so we prune to try and improve it.
# random_state=2 added for reproducibility, consistent with every other model in this file
PKS_pruned = DecisionTreeClassifier(criterion="gini", max_depth=3, random_state=2)
PKS_pruned.fit(X_train, y_train)
print("Train: %.2f" % PKS_pruned.score(X_train, y_train))  # performance on train data
print("Test: %.2f" % PKS_pruned.score(X_test, y_test))     # performance on test data
# We tried max_depth values of 2,3,4,5 to prune;
# max_depth=3 gave the best test score (~80%)
preds_train = PKS_pruned.predict(X_train)
preds_test = PKS_pruned.predict(X_test)
acc_DT = accuracy_score(y_test, preds_test)
# Confusion matrix (cross-tab view)
pd.crosstab(y_test, preds_test, rownames=['Actual'], colnames=['Predicted'])
print(PKS_pruned.score(X_test, y_test))
y_predict = PKS_pruned.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_predict, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=["No", "Yes"], columns=["No", "Yes"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')
# Store the accuracy results for each model in a dataframe for final comparison.
# Wrap acc_DT in a list for consistency with the tempResultsDf constructions below.
resultsDf = pd.DataFrame({'Method': ['Decision Tree'], 'accuracy': [acc_DT]})
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
# Logistic Regression test accuracy : 0.81
# Decision Tree test accuracy       : 0.80
#Apply the Random Forest model and print the accuracy of the Random Forest model
# random_state=2 added for reproducibility, consistent with the other models;
# without it each run of this cell gives a different accuracy.
PKS_rfcl = RandomForestClassifier(n_estimators=50, random_state=2)
PKS_rfcl = PKS_rfcl.fit(X_train, y_train)
# Predict once and reuse (the original called predict(X_test) twice for the same result)
pred_RF = PKS_rfcl.predict(X_test)
acc_RF = accuracy_score(y_test, pred_RF)
print(PKS_rfcl.score(X_test, y_test))
cm = metrics.confusion_matrix(y_test, pred_RF, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=["No", "Yes"], columns=["No", "Yes"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')
tempResultsDf = pd.DataFrame({'Method': ['Random Forest'], 'accuracy': [acc_RF]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
#Apply Bagging Classifier Algorithm and print the accuracy.
# oob_score=True lets the model also report out-of-bag accuracy if desired
PKS_bgcl = BaggingClassifier(n_estimators=50, max_samples=.7, bootstrap=True, oob_score=True, random_state=2)
PKS_bgcl = PKS_bgcl.fit(X_train, y_train)
# Predict once and reuse (the original called predict(X_test) twice for the same result)
pred_BG = PKS_bgcl.predict(X_test)
acc_BG = accuracy_score(y_test, pred_BG)
print(PKS_bgcl.score(X_test, y_test))
cm = metrics.confusion_matrix(y_test, pred_BG, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=["No", "Yes"], columns=["No", "Yes"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')
tempResultsDf = pd.DataFrame({'Method': ['Bagging'], 'accuracy': [acc_BG]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
#Apply AdaBoost Ensemble Algorithm for the same data and print the accuracy.
PKS_abcl = AdaBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=2)
PKS_abcl = PKS_abcl.fit(X_train, y_train)
# Predict once and reuse (the original called predict(X_test) twice for the same result)
pred_AB = PKS_abcl.predict(X_test)
acc_AB = accuracy_score(y_test, pred_AB)
print(PKS_abcl.score(X_test, y_test))
cm = metrics.confusion_matrix(y_test, pred_AB, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=["No", "Yes"], columns=["No", "Yes"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')
tempResultsDf = pd.DataFrame({'Method': ['Adaboost'], 'accuracy': [acc_AB]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
#Apply GradientBoost Classifier Algorithm for the same data and print the accuracy
PKS_gbcl = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, random_state=2)
PKS_gbcl = PKS_gbcl.fit(X_train, y_train)
# Predict once and reuse (the original called predict(X_test) twice for the same result)
pred_GB = PKS_gbcl.predict(X_test)
acc_GB = accuracy_score(y_test, pred_GB)
print(PKS_gbcl.score(X_test, y_test))
cm = metrics.confusion_matrix(y_test, pred_GB, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=["No", "Yes"], columns=["No", "Yes"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')
tempResultsDf = pd.DataFrame({'Method': ['Gradient Boost'], 'accuracy': [acc_GB]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
# Final comparison of all model accuracies collected above
resultsDf
# Pruned decision tree: max_depth values 2,3,4,5 were tried; max_depth=3 scored best (~80%)
# For this dataset, Random Forest gives the best test accuracy (~81.3%)
# Decision Tree, AdaBoost & Gradient Boost rank second (~79.6%)
# Bagging is the lowest (~76.2%)